rm(list=ls())
library(plyr)
library(lubridate)
library(dplyr)
library(ggplot2)
library(ROCR)
library(ggmap)
library(rpart)
library(curl)
library(jsonlite)
library(broom)
library(rgdal)
library(scales)
Read in data and graph injuries
#d.in <- read.csv('~/Desktop/visionzero/data/final/injury_monthly_final.csv', header = TRUE)
d.in <- read.csv('~/Desktop/data_analysis/project/visionzero/data/final/injury_monthly_final.csv', header = TRUE)
#Filter out intersections w/o injuries
d.in <- d.in %>%
filter(MN != 0)
#All Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=Injuries_COUNT)) +
geom_bar(stat="identity", fill="steelblue") +
labs(x = "Month", y = "Injuries") +
ggtitle("Monthly Injuries (2016)")
plot(g)
#Pedestrian Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=PedInjurie_COUNT)) +
geom_bar(stat="identity", fill="steelblue") +
labs(x = "Month", y = "Injuries") +
ggtitle("Pedestrian Injuries (2016)")
plot(g)
#Bike Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=BikeInjuri_COUNT)) +
geom_bar(stat="identity", fill="steelblue") +
labs(x = "Month", y = "Injuries") +
ggtitle("Bike Injuries (2016)")
plot(g)
#Motor Vehicle Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=MVOInjurie_COUNT)) +
geom_bar(stat="identity", fill="steelblue") +
labs(x = "Month", y = "Injuries") +
ggtitle("Motor Vehicle Injuries (2016)")
plot(g)
#Note: We had trouble grouping data by type and making a stacked bar chart, so did it in excel.
Plot injuries on a map
d.in <- read.csv('~/Desktop/data_analysis/project/visionzero/data/final/injury_monthly_final.csv', header = TRUE)
#d.in <- read.csv('~/Desktop/visionzero/data/final/injury_monthly_final.csv', header = TRUE)
# Set a base map
nyc <- get_map(location = c(lon = -73.935, lat = 40.712), zoom = 11)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=40.712,-73.935&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
#Bike injuries by month
bike_inj <- d.in %>% filter(Injuries == 1 & BikeInjuri == 1)
g <- ggmap(nyc) + geom_point(data = bike_inj, aes(x = Lon, y = Lat, fill=month(MN,label=TRUE,abbr=TRUE)),size = 1, shape = 21) + ggtitle("Bike Injuries (2016)")
plot(g)
## Warning: Removed 51 rows containing missing values (geom_point).
#Pedestrian injuries by month
ped_inj <- d.in %>% filter(Injuries == 1 & PedInjurie == 1)
g <- ggmap(nyc) + geom_point(data = ped_inj, aes(x = Lon, y = Lat, fill=month(MN,label=TRUE,abbr=TRUE)),size = 1, shape = 21) + ggtitle("Pedestrian Injuries (2016)")
plot(g)
## Warning: Removed 274 rows containing missing values (geom_point).
#MVO injuries by month
mvo_inj <- d.in %>% filter(Injuries == 1 & MVOInjurie == 1)
g <- ggmap(nyc) + geom_point(data = mvo_inj, aes(x = Lon, y = Lat, fill=month(MN,label=TRUE,abbr=TRUE)),size = 1, shape = 21) + ggtitle("Motor Vehicle Injuries (2016)")
plot(g)
## Warning: Removed 908 rows containing missing values (geom_point).
#format dataset - turn variables into factors
d.in <- d.in %>%
mutate(Injuries = as.factor(Injuries),
PedInjurie = as.factor(PedInjurie),
BikeInjuri = as.factor(BikeInjuri),
MVOInjurie = as.factor(MVOInjurie),
bike_priority_districts = as.factor(bike_priority_districts),
enhanced_crossings = as.factor(enhanced_crossings),
left_turn_traffic_calming = as.factor(left_turn_traffic_calming),
neighborhood_slow_zones = as.factor(neighborhood_slow_zones),
leading_pedestrian_interval_signals = as.factor(leading_pedestrian_interval_signals),
signal_timing = as.factor(signal_timing),
speed_humps = as.factor(speed_humps),
safe_streets_for_seniors = as.factor(safe_streets_for_seniors),
street_improvement_projects_corridors = as.factor(street_improvement_projects_corridors),
vz_priority_corridors = as.factor(vz_priority_corridors),
vz_priority_intersections = as.factor(vz_priority_intersections),
arterial_slow_zones = as.factor(arterial_slow_zones),
MN = as.factor(MN),
#MN = as.character.numeric_version(MN),
street_improvement_projects_corridors = as.factor(street_improvement_projects_corridors),
vz_priority_zones = as.factor(vz_priority_zones))
Multiple Logstic Regressions
injury_monthly.glm <- glm(Injuries ~ MN + enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)
## Warning: glm.fit: algorithm did not converge
#summary(injury_yearly.glm)
#exp(coef(injury_yearly.glm))
#exp(confint(injury_yearly.glm))
logistic.injurymonthly.pedinjury <- glm(PedInjurie ~ MN + arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + leading_pedestrian_interval_signals + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)
#summary(logistic.injuryyearly.pedinjury)
logistic.injurymonthly.bikeinjury <- glm(BikeInjuri ~ MN + bike_priority_districts + enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + signal_timing + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)
#summary(logistic.injuryyearly.bikeinjury)
logistic.injurymonthly.MVOinjurie <- glm(MVOInjurie ~ arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)
Build Predictive Model
#GENERAL INJURY
#split data 70-30
set.seed(123)
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)
d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ]
#train logistic regression take 1 (all)
m.log <- glm(Injuries ~ bike_priority_districts + enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors + vz_priority_corridors + vz_priority_intersections + vz_priority_zones, data = d.train, family = "binomial")
a <- data.frame(exp(confint(m.log)))
## Waiting for profiling to be done...
b <- data.frame(exp(coef(m.log)))
#train logistic regression take 2 (only real safety features)
#m.log <- glm(Injuries ~ enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors + neighborhood_slow_zones, data = d.train, family = "binomial")
#train logistic regression take 3 (only problem zones)
#m.log <- glm(Injuries ~ bike_priority_districts + vz_priority_corridors + vz_priority_intersections + vz_priority_zones, data = d.train, family = "binomial")
#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")
#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.2,
1,
0)
# TP
tp <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()
# Sensitivity (or TPR)
sens <- tp/(tp+fn)
# Specificity ( 1 - FPR)
spec <- tn/(tn+fp)
# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)
# FPR (1- Specificity)
1 - (tn/(tn+fp))
## [1] 0.1500634
# FNR
fn/(fn+tp)
## [1] 0.4030172
#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf)
auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.7234597
#Decision Tree
mtree.1 <- rpart(Injuries ~ bike_priority_districts + enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors + vz_priority_corridors + vz_priority_intersections + vz_priority_zones, data=d.train, method="class")
plot(mtree.1)
text(mtree.1, pretty = 1)
d.var_imp <-data.frame(mtree.1$variable.importance)
names(d.var_imp) <- "importance"
d.var_imp$variable <-as.factor(rownames(d.var_imp))
d.var_imp <-transform(d.var_imp, variable=reorder(variable, -importance) )
filt5 <- d.var_imp %>% top_n(-5)
## Selecting by variable
g <-ggplot(filt5,aes(x=variable, y=importance)) +
geom_bar(stat="identity")
plot(g)
#PEDESTRIAN INJURY
set.seed(123)
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)
d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ]
#train logistic regression
m.log <- glm(PedInjurie ~ arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + leading_pedestrian_interval_signals + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.test)
#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")
#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.05,
1,
0)
# TP
tp <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()
# Sensitivity (or TPR)
sens <- tp/(tp+fn)
# Specificity ( 1 - FPR)
spec <- tn/(tn+fp)
# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)
# FPR (1- Specificity)
1 - (tn/(tn+fp))
## [1] 0.1222029
# FNR
fn/(fn+tp)
## [1] 0.5208333
#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf)
auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.6784819
#BIKE INJURY
set.seed(123)
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)
d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ]
#train logistic regression
m.log <- glm(BikeInjuri ~ enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + signal_timing + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.test)
#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")
#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.03,
1,
0)
# TP
tp <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()
# Sensitivity (or TPR)
sens <- tp/(tp+fn)
# Specificity ( 1 - FPR)
spec <- tn/(tn+fp)
# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)
# FPR (1- Specificity)
1 - (tn/(tn+fp))
## [1] 0.1069607
# FNR
fn/(fn+tp)
## [1] 0.5267002
#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf)
auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.6831696
#MVO INJURY
set.seed(123)
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)
d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ]
#train logistic regression
m.log <- glm(MVOInjurie ~ arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.test)
#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")
#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.1,
1,
0)
# TP
tp <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()
# Sensitivity (or TPR)
sens <- tp/(tp+fn)
# Specificity ( 1 - FPR)
spec <- tn/(tn+fp)
# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)
# FPR (1- Specificity)
1 - (tn/(tn+fp))
## [1] 0.1192252
# FNR
fn/(fn+tp)
## [1] 0.5763889
#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf)
auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.6521929